library(RSQLite)
library(foreign)
library(feather)
library(xtable)
library(xlsx)
library(data.table)
library(plyr)
library(dplyr)
library(stringr)
library(splitstackshape)
require("stringi")
require(stringdist)
require(RecordLinkage)
library(doParallel)
library(phonics)
library(reshape2)
require(zoo)
library(tm)
library(tabulizer)
library(lattice)
library(latticeExtra)
library(extrafont)
library(RColorBrewer)
library(classInt)
library(grid)
library(rgdal)
library(raster)
library(spatstat)
library(doSNOW)

rm(list=ls())

##### Functions #####

findClosest <- function(x, s) {
  ##First use Jaro distances:
  dist <- stringdist::stringdist(x, s, method = "jw", p = 0.1)
  slct <- suppressWarnings(which(dist == min(dist, na.rm = TRUE)))
  
  if(length(slct) == 0)
    return(as.integer(NA))
  
  ##If multiple matches select the one with closest Levenshtein distance:
  if(length(slct) > 1) {  
    dist <- stringdist::stringdist(x, s[slct], method = "lv")
    slct2 <- which(dist == min(dist, na.rm = TRUE))
    if(length(slct2) > 1) 
      return(as.integer(NA))
    if(dist[slct2] > 4) ##Set max number of transformations
      return(as.integer(NA))
    return(slct[slct2])
  }
  if(dist[slct] > .3) ##Set max Jaro score 
    return(as.integer(NA))
  return(slct)
}
OrderMatches <- function(string, stringVector, method = "lv"){
  order(stringdist(string, stringVector, method = method))
}
hsanitize <- function(x){
  gsub("\\\\hline", "", sub("\\hline", "\\toprule ", x))
}
Getwords <- function(x) { unlist(strsplit(x, "[[:punct:] ]")) }

PMatches <- function(string1, string2) { 
  sum(Getwords(string1) %in% Getwords(string2)) / length(Getwords(string1))
}
digit <- function(x, d = 3) {
  a <- format(round(as.numeric(x), d), nsmall = d)
  a[is.na(a) | grepl("NA", a)] <- ""
  a
}
matchNAown <- function(x, tab) { 
  t <- match(x, tab)
  t[which(is.na(t))] <- which(is.na(t))
  t
}

trim <- function (x) gsub("^\\s+|\\s+$", "", x)

##Number of cores to be used:
ncores <- detectCores()
if(ncores < 2) { 
    ncores <- 1
} else
    ncores <- ncores - 2
cl <- makeCluster(ncores)
registerDoParallel(cl)

######################################################################################################
### Aim: intergenerational mobility
### Datasets used: icem.sqlite3 (raw data from ICeM project at UK Data Archive)
### Application for the Special Licence request: https://icem.data-archive.ac.uk/#step1
### Data created: Dataset_1851_1881
######################################################################################################

## Folder with ICEM raw dataset 
setwd("/hdd1/Dropbox/SocialMobility_Local/ICEM")
db <- dbConnect(SQLite(), dbname="icem.sqlite")

# Folder A_CREATING_SAMPLE
fldr_a = "/Users/myra/Dropbox/NetworkParish2016/Occupational_Choice/Submission/Final_ReStat/Codes/A_CREATING_SAMPLE"

# Folder C_ANALYSIS
fldr_c = "/Users/myra/Dropbox/NetworkParish2016/Occupational_Choice/Submission/Final_ReStat/Codes/C_ANALYSIS/"

## NAPP Folder
fldr_napp = "/Users/myra/Dropbox/NetworkParish2016/Occupational_Choice/Submission/Final_ReStat/Codes/NAPP/"

## Temp folder with intermediary dataset 
fldr = ""

######################################################################################################
### LINK INDIVIDUALS ACROSS 1851-1881 CENSUSES
######################################################################################################

### STEP 1: BLOCKING # 1 ON BIRTH COUNTY, FIRST LETTER OF FIRST AND LAST NAMES

## 1851
ds <- dbGetQuery(db, "SELECT Year, RecID, ParID, ConParID, Country, Division, RegCnty, RegDist, SubDist, RC, RD, RSD, Parish, H, PID, Sex, Age, Std_Par, BpCnty, Cnti, BpCtry, Ctry, H_Sname,SameName, Rela FROM census1851")
ds$RecID <- as.numeric(ds$RecID)

ds2 <- dbGetQuery(db, "SELECT * FROM census1851b")

# keep: men 
ds$Age = as.numeric(ds$Age)
ds <- ds[ds$Sex=="M",]
# keep : son, grandson...
ds <- ds[ds$Rela==30 | ds$Rela==31 | ds$Rela==33 | ds$Rela==35 | ds$Rela==60 | ds$Rela==61 | ds$Rela==63 | ds$Rela==65 | ds$Rela==80 | ds$Rela==81,]
ds <- ds[ds$Age<=60, ]
ds2 <- ds2[ds2$RecID %in% ds$RecID, ]

# merge restricted and unrestricted data
ds_1851 <- merge(ds, ds2, by=c("Year", "RecID"), all=TRUE)
ds_1851 <- ds_1851[ds_1851$Year!=0,]
ds_1851 <- ds_1851[ds_1851$Year>1000,]
rm(ds, ds2)
ds_1851 <- ds_1851[!is.na(ds_1851$Sname),]
ds_1851 <- ds_1851[!is.na(ds_1851$Age),]

# Get first letter of first/last name
ds_1851$Pname <- trim(ds_1851$Pname)
ds_1851$Sname <- trim(ds_1851$Sname)
ds_1851$f_first = substr(ds_1851$Pname, start=1, stop=1)
ds_1851$f_last = substr(ds_1851$Sname, start=1, stop=1)

ds_1851 <- ds_1851[, c("Year","RecID","ParID","ConParID","Country","Division","RegCnty","RegDist","SubDist",
                       "RC","RD","RSD","Parish","Age","Std_Par","BpCnty","Cnti","BpCtry","Ctry",
                       "H_Sname","SameName","Pname","Oname","Sname", "f_first", "f_last")]

# Same for 1881
ds <- dbGetQuery(db, "SELECT Year, RecID, ParID, ConParID, Country, Division, RegCnty, RegDist, SubDist, RC, RD, RSD, Parish, H, PID, Sex, Age, Std_Par, BpCnty, Cnti, BpCtry, Ctry, H_Sname,SameName, Rela FROM census1881")
ds$RecID <- as.numeric(ds$RecID)

ds2 <- dbGetQuery(db, "SELECT * FROM census1881b")

ds <- ds[ds$Sex=="M",]
ds2 <- ds2[ds2$RecID %in% ds$RecID, ]

ds_1881 <- merge(ds, ds2, by=c("Year", "RecID"), all=TRUE)
rm(ds, ds2)
ds_1881 <- ds_1881[!is.na(ds_1881$Sname),]

ds_1881$Pname <- trim(ds_1881$Pname)
ds_1881$Sname <- trim(ds_1881$Sname)
ds_1881$f_first = substr(ds_1881$Pname, start=1, stop=1)
ds_1881$f_last = substr(ds_1881$Sname, start=1, stop=1)

ds_1881 <- ds_1881[, c("Year","RecID","ParID","ConParID","Country","Division","RegCnty","RegDist","SubDist",
                       "RC","RD","RSD","Parish","Age","Std_Par","BpCnty","Cnti","BpCtry","Ctry",
                       "H_Sname","SameName","Pname","Oname","Sname", "f_first", "f_last")]


# Append 1851-1881
data <- rbind(ds_1851, ds_1881)

# Create block ID on county of birth, first letter first/last name 
data<-within(data,{group<-as.numeric(factor(paste0(Cnti,f_first,f_last)))}) 

# Keep blocks where there are observations in 1851 and 1881 in the same block ID
data = as.data.table(data)[, size := uniqueN(Year), by = group]
data <- data[data$size==2,]

db2 <- dbConnect(SQLite(), dbname="social_mobility5181.sqlite")
dbWriteTable(db2, data, name="block1_5181", append=TRUE)

### STEP 2: BLOCKING 2 BASED ON STRING DISTANCES

db2 <- dbConnect(SQLite(), dbname="social_mobility5181.sqlite")

raw <- dbGetQuery(db2, "SELECT * FROM block1_5181")
raw = data.frame(raw)
raw$Year = as.numeric(raw$Year)
raw$Age = as.numeric(raw$Age)
raw <- raw[!is.na(raw$Age),]
raw$Year_of_Birth = raw$Year - raw$Age

dbDisconnect(db2)

list <- as.list(unique(raw$group))

for (j in 1:length(list)) {
  
  db2 <- dbConnect(SQLite(), dbname="social_mobility5181.sqlite")
  
  temp <- raw[raw$group==list[j],]
  A <- temp[which(temp$Year==1851),]
  B <- temp[which(temp$Year==1881),]
  rm(temp)
  
  #Count the number of observations in the source dataset
  n<-nrow(A)
  
  #Initialize empty lists
  strdist_FN<-list()
  strdist_LN<-list()
  strdist_BP<-list()
  agedist<-list()
  index<-list()
  n_obs<-list()
  
  # I now loop over each of the observations in the source dataset A. 
  # the line that starts with "index" identifies, for each observation in A, which observations in B belong to the same block
  # the lines that start with strdist_FN and strdist_LN compute the string distance of each observation in A with respect to each observation in B that belongs to its same block (this step is aimed at saving computational time)
  
  #Drop those that do not have a match within a five years window
  n_obs <- foreach(i=1:n) %dopar% length(which(abs(B$Year_of_Birth-A[i,"Year_of_Birth"])<=5))
  A <- A[which(unlist(n_obs)>0),]
  n_obs<-unlist(n_obs)[which(unlist(n_obs)>0)]
  
  index<-list()
  
  n<-length(n_obs)
  
  #index_B identifies the position or the observations in dataset B that I copmare to each observation in dataset A
  
  index_B<-foreach(i=1:n) %dopar% which(abs(B$Year_of_Birth-A[i,"Year_of_Birth"])<=5)
  
  #Create distances in age, first and last names, parish 
  
  agedist<-foreach(i=1:n) %dopar% abs(c(A[i,"Year_of_Birth"]-B[which(abs(B$Year_of_Birth-A[i,"Year_of_Birth"])<=5),"Year_of_Birth"]))
  
  strdist_LN<-foreach(i=1:n) %dopar% c(stringdist(A[i,"Sname"],B[which(abs(B$Year_of_Birth-A[i,"Year_of_Birth"])<=5),"Sname"],method="jw",p=0.1))
  
  strdist_FN<-foreach(i=1:n) %dopar% c(stringdist(A[i,"Pname"],B[which(abs(B$Year_of_Birth-A[i,"Year_of_Birth"])<=5),"Pname"],method="jw",p=0.1))
  
  strdist_BP<-foreach(i=1:n) %dopar% c(stringdist(A[i,"Std_Par"],B[which(abs(B$Year_of_Birth-A[i,"Year_of_Birth"])<=5),"Std_Par"],method="jw",p=0.1))
  
  soundex_LN<-foreach(i=1:n) %dopar% c(ifelse(soundex(A[i,"Sname"])==soundex(B[which(abs(B$Year_of_Birth-A[i,"Year_of_Birth"])<=5),"Sname"]), 1, 0))
  
  soundex_FN<-foreach(i=1:n) %dopar% c(ifelse(soundex(A[i,"Pname"])==soundex(B[which(abs(B$Year_of_Birth-A[i,"Year_of_Birth"])<=5),"Pname"]), 1, 0))
  
  agedist<-unlist(agedist)
  strdist_LN<-unlist(strdist_LN)
  strdist_FN<-unlist(strdist_FN)
  strdist_BP<-unlist(strdist_BP)
  soundex_LN<-unlist(soundex_LN)
  soundex_FN<-unlist(soundex_FN)
  
  index_A<-c()
  if (length(unlist(index_B))>0){  
    for (i in 1:n){
      index_A<-append(index_A,rep(i,n_obs[i]))
    }
    
    data<-cbind(A[index_A,"RecID"], A[index_A, "group"], A[index_A,"Pname"], A[index_A,"Sname"], A[index_A,"Std_Par"], A[index_A, "Cnti"], A[index_A,"Year_of_Birth"], B[unlist(index_B),"RecID"], B[unlist(index_B),"group"], B[unlist(index_B),"Pname"],B[unlist(index_B),"Sname"], B[unlist(index_B),"Std_Par"], B[unlist(index_B),"Cnti"], B[unlist(index_B),"Year_of_Birth"], agedist,strdist_LN, strdist_FN, strdist_BP, soundex_LN, soundex_FN)
    
    colnames(data)<-c("RecID_51", "group_51", "First_Name_51", "Last_Name_51", "Std_Par_51", "Cnti_51", "Year_of_Birth_51", "RecID_81", "group_81", "First_Name_81","Last_Name_81", "Std_Par_81", "Cnti_81", "Year_of_Birth_81", "Age_Dist","Dist_LN","Dist_FN", "Dist_parish", "soundex_LN", "soundex_FN")
    
    data <- data.frame(data)
    
    dbWriteTable(db2, data, name="block2_5181", append=TRUE)
  }
  
  print(j)
  dbDisconnect(db2)
  
}

### STEP 3: EM Algorithm
### The goal is to use the data to estimate the likelihood that each distance corresponds to a true match, even though we do not know for sure
### which records are a true match and which records are a non-match
### EM algorithm is necessary to estimate the parameters because we do not observe true match status which 
### makes the direct max of the likelihood function complicated computationally

### 1. assume that distances between records follow a particular type of distribution
### allowing two different distributions for matches and non-matches
### with probability (p_M, p_U), distances are distributed normally with mean (mu_M, mu_U) and s.d. (sigma_M, sigma_U)
### 2. Use ML to estimate parameters (p_M, p_U, mu_M, mu_U, sigma_M, sigma_U) based on vector of distances in the identifying variables (in our case age, first/last names and parish of birth)
### Use ML to find parameters of the statistical distribution that max the likelihood of observing the observed distances
### and use parameters to identify two clusters (one from which true matches are more likely and one from which non-matches are more likely)
### The further apart mu_M is from mu_U (sigma_M and sigma_U are small), the more confident we can distinguish matches and non-matches
### Ex. We know Pr(dist_age) = Pr(dist_age in M) * p_M + Pr(dist_age in U) * (1 - p_M)
### We assume a statistical distribution for Pr(dist_age in M) and Pr(dist_age in U). 
### 3. Compute the probability that a pair of observations is a match given the observed distances in identifying variables 
### Pr(dist_age in M | obs distances)

db2 <- dbConnect(SQLite(), dbname="social_mobility5181.sqlite")

dbGetQuery(db2, "SELECT count(*) FROM block2_5181")
# 2,279,301,187
dbGetQuery(db2, "SELECT count(*) FROM block2_5181 WHERE Dist_LN<=0.3 AND Dist_FN<=0.3")
# 298,653,058

# Identify county of birth: 
raw <- dbGetQuery(db2, "SELECT * FROM block1_5181")
raw = data.frame(raw)
county = as.list(unique(raw$Cnti))

for (county_no in 1:length(county)){
  
  print(paste(county_no, " - ", county[county_no]))
  
  db2 <- dbConnect(SQLite(), dbname="social_mobility5181.sqlite")
  
  data <- dbGetQuery(db2, paste("SELECT * FROM block2_5181 WHERE Dist_LN<=0.3 AND Dist_FN<=0.3 AND Cnti_51=\"", county[county_no], "\"", collapse="",sep=""))
  
  if (nrow(data)>2){
    
    n <- length(unique(data[,"RecID_51"]))
    n_obs <- length(data[,"RecID_51"])

    strdist_FN<-data$Dist_FN
    strdist_LN<-data$Dist_LN
    agedist<-as.numeric(data$Age_Dist)

    # Dictionary for the string distance values for first/last names
    # Winkler uses for the (1-jw) [0, 0.6], (0.66,0.88], (0.88, 0.94], and (0.94,1] 
    # Create 4 categories for strings + 0
    dic_strdist<-function(x){
        ifelse(x <= 0.067,1,0) + ifelse(x > 0.067 & x <= 0.12,2,0) + ifelse(x > 0.12 & x <= 0.25,3,0) + ifelse(x > 0.25,4,0)
    }

    # Convert string distance variables vector to vector with just the corresponding index
    strdist_FN_index <- dic_strdist(strdist_FN)
    strdist_LN_index <- dic_strdist(strdist_LN)

    ### EM Algorithm

    # Guess the unconditional probability to be a true match 
    # Initial value for proportion of matches (M) and unmatched (U)
    p_M_0 <- n/n_obs
    p_U_0 <- 1-p_M_0

    # String distribution: multinomial 

    # Set number of categorical variables (k) for names and ages
    # There are 4 groups defined below by the dic_strdist functions
    # There are 6 groups for age differences (0,1,2,3,4,5) - max 5 after block1
    # There are 2 groups for parish of birth 
    k_N <- 4
    k_A <- 6
    i_N <- 1:k_N
    i_A <- 1:k_A

    # Guess initial values for parameters of the conditional distribution (theta_0s)
    # Priors are set equal to 1/k for the unmatched and to 1/k + [(k-1)/2 - (i-1)]/(k^2) for the matched

    # First name priors 
    # for the unmatched assume no further information provided by knowing that unmatched. 
    # for the matched impose a decreasing LR 
    theta_strdist_FN_U_0 <- tabulate(strdist_FN_index)/length(strdist_FN_index)
    theta_strdist_FN_M_0 <- (1/k_N+((k_N-1)/2-(i_N-1))/k_N^2)

    # Last name priors 
    # my prior is that the Pr(Distance/U) is approx the empirical frequencies of each distance
    theta_strdist_LN_U_0 <- tabulate(strdist_LN_index)/length(strdist_LN_index)
    theta_strdist_LN_M_0 <- (1/k_N+((k_N-1)/2-(i_N-1))/k_N^2)

    # Age distribution
    # Multinomial over 0,1,2,3,4,5 absolute age difference
    theta_agedist_U_0 <- tabulate(agedist+1)/length(agedist)
    #print(agedist)
    theta_agedist_M_0 <- (1/k_A+((k_A-1)/2-(i_A-1))/k_A^2)

    # Start loop
    t<-1
    error<-10
    error_v<-vector()
    iter<-500
    #iter<-5000

    # Data: age distribution, string distribution in first/last name
    dataagg <- cbind(c(1:length(agedist)),agedist,strdist_FN_index, strdist_LN_index)
    # Count number of observations for each possible combination (age, first/last name distributions)
    dataagg <- aggregate(dataagg[,1], by=list(agedist,strdist_FN_index, strdist_LN_index), FUN=length)

    rm(data)

    while(error>0.001 & t<iter){
  
        # Pr(distance/parameters)
        # add 1 become of 0 from perfect match
        p_age_M<-theta_agedist_M_0[dataagg[,1]+1]
        p_age_U<-theta_agedist_U_0[dataagg[,1]+1]
  
        p_str_FN_M<-theta_strdist_FN_M_0[dataagg[,2]]
        p_str_FN_U<-theta_strdist_FN_U_0[dataagg[,2]]
  
        p_str_LN_M<-theta_strdist_LN_M_0[dataagg[,3]]
        p_str_LN_U<-theta_strdist_LN_U_0[dataagg[,3]]
  
        # Expectation step: given theta_M_0 and p_M_0 infer w using Bayes's rule
        w <- (p_str_FN_M*p_str_LN_M*p_age_M*p_M_0)/((p_str_FN_M*p_str_LN_M*p_age_M*p_M_0)+(p_str_FN_U*p_str_LN_U*p_age_U*p_U_0))
  
        # Maximization step
        # ML estimates are 
        # (1) p_M_1 = (1/N) sum w
        p_M_1<-weighted.mean(w,dataagg[,4])
        p_U_1<-1-p_M_1
  
        theta_agedist_M_1<-vector()
        theta_agedist_U_1<-vector()
  
        theta_strdist_FN_M_1<-vector()
        theta_strdist_FN_U_1<-vector()
  
        theta_strdist_LN_M_1<-vector()
        theta_strdist_LN_U_1<-vector()
  
        # (2) theta_M = max(sum w*log Pr(theta | obs))
        for (i in 1:k_A){
            theta_agedist_M_1[i]<-(w[which(dataagg[,1]==i-1)]%*%dataagg[which(dataagg[,1]==i-1),4])/(w%*%dataagg[,4])  
            theta_agedist_U_1[i]<-((1-w[which(dataagg[,1]==i-1)])%*%dataagg[which(dataagg[,1]==i-1),4])/((1-w)%*%dataagg[,4])
        }
  
        for (i in 1:k_N){
            theta_strdist_FN_M_1[i]<-(w[which(dataagg[,2]==i)]%*%dataagg[which(dataagg[,2]==i),4])/(w%*%dataagg[,4])
            theta_strdist_FN_U_1[i]<-((1-w[which(dataagg[,2]==i)])%*%dataagg[which(dataagg[,2]==i),4])/((1-w)%*%dataagg[,4])
    
            theta_strdist_LN_M_1[i]<-(w[which(dataagg[,3]==i)]%*%dataagg[which(dataagg[,3]==i),4])/(w%*%dataagg[,4])
            theta_strdist_LN_U_1[i]<-((1-w[which(dataagg[,3]==i)])%*%dataagg[which(dataagg[,3]==i),4])/((1-w)%*%dataagg[,4])
        }
  
        # Check difference in absolute value
  
        t<-t+1
  
        error1<-max(abs(rbind(theta_agedist_M_1,theta_agedist_U_1)-rbind(theta_agedist_M_0,theta_agedist_U_0)))
        error2<-max(abs(rbind(theta_strdist_FN_M_1,theta_strdist_FN_U_1,theta_strdist_LN_M_1,theta_strdist_LN_U_1)-rbind(theta_strdist_FN_M_0,theta_strdist_FN_U_0,theta_strdist_LN_M_0,theta_strdist_LN_U_0)))
        error3<-abs(p_M_1-p_M_0)
        error<-max(error1,error2,error3)
        error_v[t]<-error
        error <- ifelse(is.na(error), 0, error)
        
        # Update values
  
        theta_agedist_M_0<-theta_agedist_M_1
        theta_agedist_U_0<-theta_agedist_U_1
  
        theta_strdist_FN_M_0<-theta_strdist_FN_M_1
        theta_strdist_FN_U_0<-theta_strdist_FN_U_1
  
        theta_strdist_LN_M_0<-theta_strdist_LN_M_1
        theta_strdist_LN_U_0<-theta_strdist_LN_U_1
  
        p_M_0<-p_M_1
        p_U_0<-p_U_1
  
        print(t)
        }

    # Save estimates 
    
    parameters<-c(theta_agedist_M_0,theta_agedist_U_0,theta_strdist_FN_M_0,theta_strdist_FN_U_0,theta_strdist_LN_M_0,theta_strdist_LN_U_0,p_M_0)
    names<-c("AM0","AM1","AM2","AM3","AM4","AM5","AU0","AU1","AU2","AU3","AU4","AU5","FM0","FM1","FM2","FM3","FU0","FU1","FU2","FU3","LM0","LM1","LM2","LM3","LU0","LU1","LU2","LU3","P")
    parameters <- cbind(names, parameters, county[county_no])
    parameters <- data.frame(parameters)
    names(parameters) <- c("names", "parameters", "county")
    parameters$names = as.character(parameters$names)
    parameters$parameters = as.numeric(parameters$parameters)
    parameters$county = as.character(parameters$county)
    
    db3 <- dbConnect(SQLite(), dbname="EM_5181.sqlite")
    dbWriteTable(db3, parameters, name="EM_5181", append=TRUE)
    dbDisconnect(db3)
    print("done")
  }
}


### STEP 4: Final Choice

# load dataset + EM parameters
db3 <- dbConnect(SQLite(), dbname="EM_5181.sqlite")

parameters_raw <- dbGetQuery(db3, "SELECT * FROM EM_5181")
parameters_raw$parameters <- as.numeric(parameters_raw$parameters)

# Identify county of birth: 
db2 <- dbConnect(SQLite(), dbname="social_mobility5181.sqlite")
raw <- dbGetQuery(db2, "SELECT * FROM block1_5181")
raw = data.frame(raw)
county = as.list(unique(raw$Cnti))
all = data.frame(data)

for (county_no in 1:length(county)){
  
  print(paste(county_no, " - ", county[county_no]))
  
  parameters <- parameters_raw[parameters_raw$county==county[county_no],]
  
  if (nrow(parameters)>0){
    
  db2 <- dbConnect(SQLite(), dbname="social_mobility5181.sqlite")
  
  data <- dbGetQuery(db2, paste("SELECT * FROM block2_5181 WHERE Dist_LN<=0.3 AND Dist_FN<=0.3 AND Cnti_51=\"", county[county_no], "\"", collapse="",sep=""))
  
  # Create dummies for the categories of string distances
  data$Dist_FN_0 <- ifelse(data$Dist_FN<=0.067, 1, 0)
  data$Dist_FN_1 <- ifelse(data$Dist_FN>0.067 & data$Dist_FN<=0.12, 1, 0)
  data$Dist_FN_2 <- ifelse(data$Dist_FN>0.12 & data$Dist_FN<=0.25, 1, 0)
  data$Dist_FN_3 <- ifelse(data$Dist_FN>0.25, 1, 0)
  
  data$Dist_LN_0 <- ifelse(data$Dist_LN<=0.067, 1, 0)
  data$Dist_LN_1 <- ifelse(data$Dist_LN>0.067 & data$Dist_LN<=0.12, 1, 0)
  data$Dist_LN_2 <- ifelse(data$Dist_LN>0.12 & data$Dist_LN<=0.25, 1, 0)
  data$Dist_LN_3 <- ifelse(data$Dist_LN>0.25, 1, 0)
  
  # Create dummies for age distance
  data$Age_Dist_0 <- ifelse(data$Age_Dist==0, 1, 0)
  data$Age_Dist_1 <- ifelse(data$Age_Dist==1, 1, 0)
  data$Age_Dist_2 <- ifelse(data$Age_Dist==2, 1, 0)
  data$Age_Dist_3 <- ifelse(data$Age_Dist==3, 1, 0)
  data$Age_Dist_4 <- ifelse(data$Age_Dist==4, 1, 0)
  data$Age_Dist_5 <- ifelse(data$Age_Dist==5, 1, 0)
  
  # Create dummies for parish
  #data$Parish_0 <- ifelse(data$Dist_parish <= 0.05,1,0)
  #data$Parish_1 <- ifelse(data$Dist_parish > 0.05,1,0)
  
  # Generate matching score
  # Pr (Obs in M | parameters thetas)
  data$w_M_A <- data$Age_Dist_0*parameters[parameters$names=="AM0", "parameters"]+data$Age_Dist_1*parameters[parameters$names=="AM1", "parameters"]+data$Age_Dist_2*parameters[parameters$names=="AM2", "parameters"]+data$Age_Dist_3*parameters[parameters$names=="AM3", "parameters"]+data$Age_Dist_4*parameters[parameters$names=="AM4", "parameters"]+data$Age_Dist_5*parameters[parameters$names=="AM5", "parameters"]
  data$w_M_FN <- data$Dist_FN_0*parameters[parameters$names=="FM0", "parameters"]+data$Dist_FN_1*parameters[parameters$names=="FM1", "parameters"]+data$Dist_FN_2*parameters[parameters$names=="FM2", "parameters"]+data$Dist_FN_3*parameters[parameters$names=="FM3", "parameters"]
  data$w_M_LN <- data$Dist_LN_0*parameters[parameters$names=="LM0", "parameters"]+data$Dist_LN_1*parameters[parameters$names=="LM1", "parameters"]+data$Dist_LN_2*parameters[parameters$names=="LM2", "parameters"]+data$Dist_LN_3*parameters[parameters$names=="LM3", "parameters"]
  
  data$w_U_A <- data$Age_Dist_0*parameters[parameters$names=="AU0", "parameters"]+data$Age_Dist_1*parameters[parameters$names=="AU1", "parameters"]+data$Age_Dist_2*parameters[parameters$names=="AU2", "parameters"]+data$Age_Dist_3*parameters[parameters$names=="AU3", "parameters"]+data$Age_Dist_4*parameters[parameters$names=="AU4", "parameters"]+data$Age_Dist_5*parameters[parameters$names=="AU5", "parameters"]
  data$w_U_FN <- data$Dist_FN_0*parameters[parameters$names=="FU0", "parameters"]+data$Dist_FN_1*parameters[parameters$names=="FU1", "parameters"]+data$Dist_FN_2*parameters[parameters$names=="FU2", "parameters"]+data$Dist_FN_3*parameters[parameters$names=="FU3", "parameters"]
  data$w_U_LN <- data$Dist_LN_0*parameters[parameters$names=="LU0", "parameters"]+data$Dist_LN_1*parameters[parameters$names=="LU1", "parameters"]+data$Dist_LN_2*parameters[parameters$names=="LU2", "parameters"]+data$Dist_LN_3*parameters[parameters$names=="LU3", "parameters"]
  
  data$w <- (data$w_M_A*data$w_M_FN*data$w_M_LN)*parameters[parameters$names=="P", "parameters"]/((data$w_M_A*data$w_M_FN*data$w_M_LN)*parameters[parameters$names=="P", "parameters"]+(data$w_U_A*data$w_U_FN*data$w_U_LN)*(1-parameters[parameters$names=="P", "parameters"]))
  
  max_w <- max(data$w, na.rm=TRUE)
  
  data$w <- data$w / max_w
  
  data$RecID_51 <- as.numeric(data$RecID_51)
  data$RecID_81 <- as.numeric(data$RecID_81)
  
  ## Identify the best match for any given observation in data set A
  data <- data %>%
    group_by(RecID_51) %>%
    mutate(best_match_51 = max(w, na.rm=TRUE))
  
  data <- data %>%
    group_by(RecID_81) %>%
    mutate(best_match_81 = max(w, na.rm=TRUE))
  
  data <- data.frame(data)
  
  data <- data[data$best_match_51==data$w | data$best_match_81==data$w, ]

  data$Potential_Match_51 <- ifelse(data$w>=(60/100) & data$best_match_51==data$w, 1, 0)
  
  data$Potential_Match_81 <- ifelse(data$w>=(60/100) & data$best_match_81==data$w, 1, 0)
  
  data <- data %>%
    group_by(RecID_51) %>%
    mutate(n_matches_51 = sum(Potential_Match_51))
  
  data <- data %>%
    group_by(RecID_81) %>%
    mutate(n_matches_81 = sum(Potential_Match_81))
  
  data <- data.frame(data)
  
  data <- data[data$Potential_Match_51==1 & data$Potential_Match_81==1 & data$n_matches_51==1 & data$n_matches_81==1 & data$best_match_51==data$w & data$best_match_81==data$w,]
  data <- data[!is.na(data$RecID_51),]
  
  all = rbind(all, data)
  
  }
  print("done")
  
}

db4 <- dbConnect(SQLite(), dbname="link_5181.sqlite")
dbWriteTable(db4, all, name="link_M1", append=TRUE, row.names=TRUE)
dbDisconnect(db4)

### Reshape resulting linked dataset 

db2 <- dbConnect(SQLite(), dbname="link_5181.sqlite")
m1 <- dbGetQuery(db2, "SELECT RecID_51, RecID_81 FROM link_M1")
m1$link_id <- 1:nrow(m1)

m1 <- melt(m1, id.vars="link_id")
m1$Year = ifelse(m1$variable=="RecID_51", 1851, 1881)
m1 <- m1[, c("link_id", "value", "Year")]
names(m1) <- c("link_id", "RecID", "Year")
m1 <- m1[order(m1$link_id),]

length(unique(m1$RecID[m1$Year==1851]))
# 569152
length(unique(m1$RecID[m1$Year==1881]))
# 569152

db5 <- dbConnect(SQLite(), dbname="link_5111.sqlite")
dbWriteTable(db5, all, name="data", overwrite=TRUE, row.names=TRUE)

##########################################################################
#### ADD RAW DATA
### Data created: links_withHHmen

db5 <- dbConnect(SQLite(), dbname="link_5111.sqlite")
all <- dbGetQuery(get("db5"), "SELECT * FROM data")
setDT(all)

tab <- list()
for(y in c(1851, 1881)) {
  
  ## Select the linked for census y:
  blink <- all[Year == y]
  
  ## Merge with original census data 
  base <- dbGetQuery(get("db"), paste0("SELECT * FROM census", y))
  setDT(base)
  base <- unique(base)
  gc()
  
  ## Record average household size, as delimited by H
  tmp <- mean(unlist(base[, .N, by = H][, 2]))
  tmp <- c(tmp,mean(unlist(base[, (.N != uniqueN(PID)), by = H][, 2])))
  
  ## create household ID
  ## 1) Locate every time there is a break on monotonicity of PID
  ##IMPORTANT: do not reorder before this!
  base[, PID := as.numeric(PID)]
  base[, splits := PID < c(NA, head(PID, -1))]
  base[is.na(splits), splits := FALSE]
  base[, splits := cumsum(splits)]
  
  ## 2) Every split is a new household,
  ##Further divide them by using indentifier H
  ##and making sure individuals in
  ##household share the same address:
  ##(Now reordering is fine)
  setorder(base, splits, H, SubDist, Parish, PID)
  base[, hh_id := (1:.N == 1) * 1., by = c("splits", "H", "SubDist", "Parish")]
  base[, hh_id := cumsum(hh_id)]
  
  ## How many splits and hh_id coincide?
  setorder(base, hh_id, PID)
  tmp <- c(tmp, base[, uniqueN(hh_id) / uniqueN(H)])
  
  tab <- c(tab, list(tmp))
  
  ## Signal those linked
  base[, RecID := as.numeric(RecID)]
  base <- blink[, .(RecID, linked = 1)][base, on = "RecID"]
  gc()
  base[is.na(linked), linked := 0]
  
  cat("Number of Individuals in blink =", uniqueN(blink$RecID), "\n")
  cat("Number of linked individuals in base = ", uniqueN(base[linked == 1, RecID]), "\n")
  
  ## merge making sure to keep everyone in their household
  base[, Year := as.numeric(Year)]
  ## This merge keeps all individuals in base and drops
  ## those in blink that are not in base:
  base <- blink[base, on = c("Year", "RecID")]
  ## Keep households with at least one link:
  base[, household := sum(linked), by = hh_id]
  
  base <- base[household >= 1] 
  base[, c("row_names", "splits") := NULL]
  rm(blink)
  gc()
  
  ## Get address
  address <- dbGetQuery(get("db"), paste0("SELECT * FROM census", y, "b"))
  setDT(address)
  address <- address[RecID %in% base$RecID]
  gc()
  
  ## Merge keeping all in base:
  base <- address[base, on = c("Year", "RecID")]
  
  ## remove people with no relationship
  base[, Rela := as.numeric(Rela)]
  base <- base[Rela != 9999]
  gc()
  
  ## remove people with no address:
  base <- base[!is.na(Address)]
  gc()
  
  ## remove visitors, inmates, 
  base <- base[Rela != 2000]
  gc()
  base <- base[Rela < 2010]
  gc()
  
  dbWriteTable(db5, base, name = paste0("data", y), overwrite=TRUE, row.names=TRUE)
  rm(base)
  gc()
  
  print(paste("Census", y, "merge done"))
}

tab2 <- list()
dat <- data.table()

for(y in c(1851, 1881)) {
  
  base <- dbGetQuery(db5, paste0("SELECT * FROM data", y))
  setDT(base)
  
  ## Make sure households share the same address:
  setorder(base, hh_id, Address)
  ## Majority (99%) of them do. Else split Household:
  base[, split := (1:.N == 1) * 1., by = c("hh_id", "Address")]
  base[, split := cumsum(split)]
  
  ## How many splits and hh_id coincide?
  base[, ":="(new_split = split != c(NA, head(split, -1)),
              new_hh_id = hh_id != c(NA, head(hh_id , -1)))]
  
  tmp <- prop.table(table(base[new_hh_id == TRUE,  new_split]))
  tmp <- tmp[length(tmp)]
  if(tmp < 1)
    warning("Not all hh_id share the same Address!")
  rm(tmp)
  gc()
  
  base[, c("new_split", "new_hh_id") := NULL]
  gc()
  base[, ":="(hh_id = split, split = NULL)]
  gc()
  
  tab2 <- c(tab2, list(mean(unlist(base[, .N, by = hh_id][, 2]))))
  
  ## append censuses
  dat <- rbindlist(list(dat, base), fill = TRUE)
  rm(base)
  gc()
}

## Modify hh_id to be unique in the whole dataset:
setorder(dat, hh_id, Year)
dat[, split := (1:.N == 1) * 1., by = c("hh_id", "Year")]
dat[, hh_id := cumsum(split)]

## Make hh_id to be year ascendent:
setorder(dat, Year, hh_id)
dat[, split := (1:.N == 1) * 1., by = c("hh_id", "Year")]
dat[, hh_id := cumsum(split)]

## Check uniqueness of PID within household
dat[, check := uniqueN(PID) != length(PID), by = hh_id]
if(any(dat$check))
  stop("Non unique PID within hh_id!")
dat[, check := NULL]
gc()

## Assign unique IDs to everyone:
## 1) Assign for linked
setorder(dat, link_id, Year)
dat[!is.na(link_id), ob := .N, by = link_id]
cat("There are", nrow(dat[ob == 1 & !is.na(link_id)]),
    "that have link\\_id but have only one observation (in Households\\_IDs.r). ",
    file = file.path(latexTables, "Warnings.tex"), append = TRUE)
dat[ob == 1, link_id := NA]

dat[!is.na(link_id), person_id := (1:.N == 1) * 1., by = link_id]
dat[!is.na(link_id), person_id := cumsum(person_id)]

## 2) Assign for household members of linked (make sure to start at max ID)
maxId <- max(dat$person_id, na.rm = TRUE)
setorder(dat, hh_id, Year)
dat[is.na(link_id), person_id := (1:.N) * 1.]
dat[is.na(link_id), person_id := person_id + maxId]

## 3) Keep only men of households
write_feather(dat[Sex == "M"], file.path(fldr, "links_withHHmen")) 

##########################################################################
#### CREATE HARMONIZED HOUSEHOLD ID
### Data created: Matching_Set, Father_Son

dat <- read_feather(file.path(fldr, "links_withHHmen"),
                    c("link_id", "person_id", "hh_id", "Father", "PID", "Year", "Age", "Sname", "Pname"))
setDT(dat)

## 1) For each individual, carry the hh_id of his earliest observation
setorder(dat, person_id, Year)
dat[, fhh_id := hh_id[1], by = person_id]

suppressWarnings(dat[, Age := as.numeric(Age)])
setorder(dat, hh_id, person_id)

## 2) Find fathers:
dat[, ":="(father_id = person_id[matchNAown(Father, PID)],
           father_Age = Age[matchNAown(Father, PID)]), by = hh_id]

## 2A) If father_id == person_id then no matched father
##same if son is not at least 12 years younger than father
dat[father_id == person_id | father_Age - Age < 12 |
      is.na(Age) | is.na(father_Age), father_id := NA]

setorder(dat, person_id, Year)
dat[, year_matched := Year[1],by = c("person_id", "father_id")]
dat[, age0 := father_Age[1],by = c("person_id", "father_id")] 
dat[is.na(father_id), c("year_matched", "age0") := NA]

setorder(dat, person_id, Year)
dat[!is.na(father_id), father_idH := father_id[1], by = person_id]

## Save father-son dataset
fwrite(dat[, .(person_id, father_id, father_idH, Year)],
       file.path(fldr, "Father_Son"))
dat[, father_idH := NULL]

## 2B) Carry over missing father_id:
setorder(dat, person_id, Year)
dat[, father_id := na.locf(father_id, na.rm = FALSE, fromLast = FALSE), by = person_id]
dat[, year_matched := na.locf(year_matched, na.rm = FALSE, fromLast = FALSE), by = person_id]
dat[, father_id := na.locf(father_id, na.rm = FALSE, fromLast = TRUE), by = person_id]
dat[, year_matched := na.locf(year_matched, na.rm = FALSE, fromLast = TRUE), by = person_id]

## 2C) Generate a unique father identifier:
fthr <- copy(unique(dat[!is.na(father_id), .(person_id, year_matched, father_id,
                                             father_Year = Year)]))
fthr <- copy(unique(dat[, .(father_id = person_id, father_Sname = Sname,
                            father_Pname = Pname, 
                            father_Year = Year,
                            father_Age = Age,
                            orig_matched = 1)])[fthr,on = c("father_id","father_Year"), nomatch = 0])

## 3) Using sons individual identifier get the first father_id

setorder(fthr, person_id, father_Year)
fthr[, ":="(father_idH = father_id[1],
            father_AgeH = father_Age[1],
            father_SnameH = father_Sname[1],
            father_PnameH = father_Pname[1],
            father_YearH = father_Year[1]), by = person_id]

fthr <- unique(fthr[, .(father_id, father_Sname, father_idH, father_SnameH,
                        father_Pname, father_PnameH, year_matched,
                        father_Year, father_YearH, father_Age, father_AgeH)])
gc()

fthr[is.na(father_PnameH), ":="(father_PnameH = father_Pname,
                                father_SnameH = father_Sname,
                                father_YearH = father_Year,
                                father_AgeH = father_Age,
                                year_matched = 0)]

tpr <- dat[, .(father_id = person_id, father_Sname = Sname,
               father_Pname = Pname, 
               father_Year = Year,
               father_Age = Age)][!fthr, on = c("father_id", "father_Year")]

fthr <- rbindlist(list(fthr, tpr), fill = TRUE)
fthr[is.na(father_idH), ":="(father_idH = father_id,
                             father_SnameH = father_Sname,
                             father_PnameH = father_Pname,
                             father_AgeH = father_Age,
                             father_YearH = father_Year,
                             year_matched = 0)]

## 4) Keep unique cases:
fthr <- unique(fthr[, .(father_Pname,
                        father_Sname,
                        father_Age,
                        father_id,
                        father_Year,
                        father_YearH,
                        father_idH,
                        father_PnameH,
                        father_SnameH,
                        father_AgeH,
                        year_matched)])

setorder(fthr, father_id, father_Year)

fwrite(fthr, file.path(fldr, "Matching_Set"))

######################################################################################################
### FIND THOSE THAT SHARE ANY father_id or father_idH
### Data created: Relation_List.rds
iter <- nrow(fthr)
cl <- makeCluster(10)
registerDoSNOW(cl)

pb <- txtProgressBar(max = iter, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress=progress)

l <- foreach(i = 1:iter, .combine = c,
              .options.snow = opts) %dopar% {
               require(data.table)
               fid <- fthr[i, father_id]
               fidH <- fthr[i, father_idH]
               
               list(list(ob = i,
                         slct = which(fthr[, father_id == fid | father_id == fidH |
                                             father_idH == fid | father_idH == fidH])))
             }

close(pb)
stopCluster(cl)

saveRDS(l, "Relation_List.rds")

######################################################################################################
### APPLY RELATIONSHIP AND CHECK FOR INCONSISTENCIES

a <- readRDS("Relation_List.rds")
setOb <- fread(file.path(fldr, "Matching_Set"))
names(setOb) <- gsub("father_", "", names(setOb))

a <- unlist(a)
a <- data.table(names(a), entry = a)
a[V1 == "ob", match := entry]
a[, match := na.locf(match)]
a <- a[V1 != "ob"]

## For each entry, keep the first match:
setorder(a, entry, match)
a[, keep := (1:.N == 1), by = entry]
a <- a[keep == TRUE]
gc()
a[, c("V1", "keep") := NULL]
gc()

## Use this to indentify individuals:
setOb[, idH2 := a$match]

## Recall information about first match:
setOb[, Age := as.numeric(Age)]
setOb[, AgeH := as.numeric(AgeH)]
setOb[year_matched == 0, YearH := 0]

setorder(setOb, idH2, YearH, Year)
setOb[YearH != 0, ":="(Year0 = YearH[1], Age0 = AgeH[1],
                       Pname0 = PnameH[1], Sname0 = SnameH[1]), by = idH2]
setOb[, slct := which(!is.na(Year0))[1], by = idH2]
setOb[, ":="(Year0 = YearH[slct], Age0 = AgeH[slct],
             Pname0 = PnameH[slct], Sname0 = SnameH[slct]), by = idH2]

## Distance between Name, Surname and Age:
suppressWarnings(setOb[Year >= Year0, age_dist := abs(Age - Age0 + Year0 - Year)])
suppressWarnings(setOb[Year < Year0, age_dist := abs(Year0 - Year - Age0 + Age)])
setOb[, sname_dist := stringdist(Sname, Sname0, method = "jw", p = .1)]
setOb[, pname_dist := stringdist(Pname, Pname0, method = "jw", p = .1)]

setOb[, dist := sname_dist + pname_dist + abs(age_dist) / max(abs(age_dist), na.rm = TRUE)]

## 1) More than one individual in the same year
setorder(setOb, idH2, Year)
setOb[, ob := uniqueN(id), by = c("idH2", "Year")]
setOb[, ob := max(ob), by = idH2]

## Clean inconsistent linkages
setorder(setOb, idH2, Year, dist)
setOb[, keep := !(1:.N != 1 & !is.na(idH2)), by = c("idH2", "Year")]
setOb <- setOb[keep == TRUE]

## 2) Check that one id matches at most with one idH2:
setOb[, ob := 0]
setOb[!is.na(idH2), ob := uniqueN(idH2), by = id]

## Clean inconsistent linkages
##Step one 
setorder(setOb, id, Year, YearH, dist)
setOb[, keep := !(1:.N != 1 & !is.na(idH2)),
      by = c("id", "Year", "YearH")]
setOb <- setOb[keep == TRUE]

##Step two 
setorder(setOb, id, Year, idH, dist)
setOb[, keep := !(1:.N != 1 & !is.na(idH2)),
      by = c("id", "Year")]
setOb <- setOb[keep == TRUE]

##Run some checks:
setOb[, ob := .N, by = c("id", "Year")]
if(any(setOb[, ob > 1]))
  stop("Multiple matches per id Year, Tracking_Generations.r!")

setOb[, ob := .N, by = c("idH2", "Year")]
if(any(setOb[, ob > 1]))
  stop("Multiple matches per idH2 Year, Tracking_Generations.r!")
setOb[, ob := NULL]

######################################################################################################
### CREATE HOMOGENOUS PERSON_ID
### Data created: Homogenised_ID.r

fthr <- unique(fread(file.path(fldr, "Father_Son"))[, .(person_id, father_id, Year)])
fthr <- setOb[, .(person_id = id, Year, person_idH = idH2)][fthr, on = c("person_id", "Year")]
fthr <- setOb[, .(father_id = id, Year, father_idH = idH2)][fthr, on = c("father_id", "Year")]

## 1. Further Matches Have Been Created IF there are observations with multiple person_id and father_idH:
fthr[!is.na(father_idH), ob := uniqueN(father_idH) *
       (uniqueN(person_id) > 1), by = person_idH]

## With the homogeneized set, run the same checks:
name <- read_feather(file.path(fldr, "links_withHHmen"), c("person_id", "Year", "Age", "Sname", "Pname"))
setDT(name)
suppressWarnings(name[, Age := as.numeric(Age)])
names(name)[names(name) == "person_id"] <- "id"

setorder(fthr, person_id, Year)
fthr <- fthr[ob > 1]
fthr[, idH2 := father_idH[1], by = person_idH]
fthr <- name[fthr, on = c("id==father_id", "Year")]

## Get characteristics at initial match: 
fthr[, ":="(SnameH = Sname[1],
            PnameH = Pname[1],
            YearH = Year[1],
            AgeH = Age[1],
            Sname0 = Sname[1],
            Pname0 = Pname[1],
            Year0 = Year[1],
            year_matched = Year[1],
            Age0 = Age[1]), by = idH2]

## Get distance:
suppressWarnings(fthr[, age_dist := abs(Age - Age0 + Year0 - Year)])
fthr[, sname_dist := stringdist(Sname, Sname0, method = "jw", p = .1)]
fthr[, pname_dist := stringdist(Pname, Pname0, method = "jw", p = .1)]
fthr[, dist := sname_dist + pname_dist + abs(age_dist) / max(abs(age_dist), na.rm = TRUE)]

fthr <- fthr[, names(fthr)[names(fthr) %in% names(setOb)], with = FALSE]

setOb <- rbindlist(list(fthr, setOb), fill = TRUE)
rm(fthr)
gc()

## Clean inconsistent linkages
setorder(setOb, idH2, Year, dist)
setOb[, keep := !(1:.N != 1 & !is.na(idH2)), by = c("idH2", "Year")]
setOb <- setOb[keep == TRUE]

## Step one 
setorder(setOb, id, Year, YearH, dist)
setOb[, keep := !(1:.N != 1 & !is.na(idH2)), by = c("id", "Year", "YearH")]
setOb <- setOb[keep == TRUE]

## Step two 
setorder(setOb, id, Year, idH, dist)
setOb[, keep := !(1:.N != 1 & !is.na(idH2)), by = c("id", "Year")]
setOb <- setOb[keep == TRUE]

## Run some checks:
setOb[, ob := .N, by = c("id", "Year")]
if(any(setOb[, ob > 1]))
  stop("Multiple matches per id Year, Tracking_Generations.r!")

setOb[, ob := .N, by = c("idH2", "Year")]
if(any(setOb[, ob > 1]))
  stop("Multiple matches per idH2 Year, Tracking_Generations.r!")
setOb[, ob := NULL]

## 2. Homogeneized person_id:
fthr <- unique(fread(file.path(fldr, "Father_Son"))[, .(person_id, father_id, Year)])
fthr <- setOb[, .(person_id = id, Year, person_idH = idH2)][fthr, on = c("person_id", "Year")]
fthr <- setOb[, .(father_id = id, Year, father_idH = idH2)][fthr, on = c("father_id", "Year")]

## 3. Further Matches Have Been Created IF there are observations with multiple person_id and father_idH:
fthr[!is.na(father_idH), ob := uniqueN(father_idH) *
       (uniqueN(person_id) > 1), by = person_idH]

## At this point there should be no further matches:
if(nrow(fthr[ob > 1]))
  stop("Still further matches!Tracking_Generations.r")
fthr[, ob := NULL]

## Those that still have multiple ids is because they lead to inconsistencies,
##erase them:
fthr[!is.na(father_idH), ob := uniqueN(father_idH), by = person_idH]
fthr[, ob := ifelse(any(!is.na(ob)),
                    max(ob, na.rm = TRUE),
                    0L), by = person_idH]
fthr[ob > 1, person_idH := NA]
fthr[, ob := NULL]

## Fill missing person_idH:
setorder(fthr, person_idH, person_id, Year)
fthr[is.na(person_idH), person_idH := max(fthr$person_idH, na.rm = TRUE) + 1:.N]

## Carry person_idH:
fthr[, father_id := NULL]
tpr <- copy(unique(fthr[!is.na(father_idH), .(father_idH, person_idH)]))
fthr[, father_idH := NULL]
fthr <- tpr[fthr, on = "person_idH"]

## Check no repetition within year:
fthr[, ob := .N, by = c("person_idH", "Year")]
if(any(fthr[, ob > 1]))
  stop("More than one obs per (individual, Year)! Tracking_Generations.r")

## Check no repetition within year:
fthr[, ob := .N, by = c("person_id", "Year")]
if(any(fthr[, ob > 1]))
  stop("More than one obs per (individual, Year)! Tracking_Generations.r")

fwrite(fthr, file.path(fldr, "Homogenised_ID.r"))

##############################################################################################
## Get IDs of grand fathers, siblings and save family tree  
## Data created: Family_Tree

dat <- read_feather(file.path(fldr, "links_withHHmen"), c("person_id", "hh_id", "Year", "Age", "Sname", "Pname"))
setDT(dat)

homo <- fread(file.path(fldr, "Homogenised_ID.r"))

dat <- homo[dat, on = c("person_id", "Year")]

if(nrow(dat[is.na(person_idH)]) > 0)
  stop("Missing homogeneized ID! Tracking_Generations2.r")

## Find the id of the father of the father (first finds the entry of
## the father using father_idH and matchNAown, then gets the father_idH
## of that entry) 
dat[, gFather_idH := father_idH[matchNAown(father_idH, person_idH)]]
dat[gFather_idH == father_idH, gFather_idH := NA]

## Carry over missing gfather_id:
setorder(dat, person_idH, Year)
dat[, gFather_idH := as.numeric(gFather_idH)]
suppressWarnings(dat[, gFather_idH := max(gFather_idH, na.rm = TRUE), by = person_idH])
dat[is.infinite(gFather_idH), gFather_idH := NA]

## Generate sibling id using father_idH
setorder(dat, father_idH)
dat[!is.na(father_idH), sibling_idH := (1:.N == 1) * 1., by = father_idH]
dat[!is.na(father_idH), sibling_idH := cumsum(sibling_idH)]

## Generate unique family_id:
setorder(dat, person_id, Year)
dat[, family_id := hh_id[1], by = person_idH]

## Family_id defined by grandfather
tmp <- copy(unique(dat[, .(family_id2 = family_id, gFather_idH = person_idH)]))
dat <- tmp[dat, on = "gFather_idH"]
dat[!is.na(gFather_idH), family_id := family_id2]
dat[, family_id2 := NULL]

## If there is no grandfather Family_id defined by father
tmp <- unique(dat[, .(family_id2 = family_id, father_idH = person_idH)])
dat <- tmp[dat, on = "father_idH"]
dat[is.na(gFather_idH) & !is.na(father_idH), family_id := family_id2]
dat[, family_id2 := NULL]
##Otherwise individual is his own family

## Save the family tree:
dat <- suppressWarnings(dat[, .(year16 = trunc(mean(Year - as.numeric(Age) + 16, na.rm = TRUE))),
                            by = c("person_idH", "family_id",
                                   "father_idH", "gFather_idH", 
                                   "sibling_idH")])
setorder(dat, person_idH)

dat[, ":="(ob = .N), by = person_idH]
if(any(dat[, ob > 1]))
  stop("person_idH is not unique!")
dat[, ob := NULL]

write_feather(dat, file.path(fldr, "Family_Tree"))
rm(dat)
gc()

##########################################################################
### Add Occupation             
### Data created: Father_Set

## A) Get father occupation closest to when individiual age 16:
fthr <- read_feather(file.path(fldr, "links_withHHmen"), c("person_id", 
                                                           "Year", "Age",
                                                           "hisclass", "Sname",
                                                           "lfocc_lab",
                                                           "Occ", "Occode",
                                                           "HISCO"))
setDT(fthr)
fthr <- homo[, .(person_idH, Year, person_id)][fthr, on = c("person_id", "Year")]
if(nrow(fthr[is.na(person_idH)]))
  stop("Missign homogenised id!Tracking Generations2.r")
fthr[, person_id := NULL]

## Drop if unknown occupation:
fthr <- fthr[hisclass > 0]
names(fthr)[names(fthr) == "person_idH"] <- "idH"
names(fthr) <- paste0("father_", names(fthr))

write_feather(fthr, file.path(fldr, "Father_Set"))

## Get observation of father occupation, if multiple match the one closest to year when individual was 16:
dat <- read_feather(file.path(fldr, "Family_Tree"))
setDT(dat)
fthr[, Year := father_Year]
fthr[, Year := NULL]

dat <- fthr[dat, on = "father_idH"]

dat[, dist_year16 := abs(year16 - father_Year)]
setorder(dat, person_idH, dist_year16, father_Year)
dat[, keep := 1:.N == 1, by = person_idH]
dat <- dat[which(keep)]
dat[, keep := NULL]

## B) Get grand father occupation closest to when individiual's father was age 16:
vars <- c(names(dat)[grep("father_[^id]", names(dat))], "person_idH", "dist_year16")
gfthr <- copy(dat[father_hisclass > 0, (vars), with = FALSE])
names(gfthr)[grep("father_", names(gfthr))] <- paste0("g", names(gfthr)[grep("father_", names(gfthr))])

names(gfthr)[names(gfthr) == "person_idH"] <- "father_idH"
names(gfthr)[names(gfthr) == "dist_year16"] <- "father_dist_year16"

dat <- gfthr[dat, on = "father_idH"]

## C) Get own occupation information in every census:
tmp <- read_feather(file.path(fldr, "links_withHHmen"), c("person_id", 
                                                          "Year", "Age",
                                                          "hisclass", "Sname",
                                                          "lfocc_lab",
                                                          "Occ", "Occode",
                                                          "HISCO"))
setDT(tmp)
tmp[, Age := as.numeric(Age)]
tmp <- homo[, .(person_idH, Year, person_id)][tmp, on = c("person_id", "Year")]
if(nrow(tmp[is.na(person_idH)]))
  stop("Missing homogenised id!Tracking Generations2.r")

dat <- tmp[dat, on = "person_idH"]
rm(tmp)
gc()

## D) Get Occupation of Older brother:
##Generate within family sibling ID:
setorder(dat, sibling_idH, Year, -Age)
dat[!is.na(Age), sib_id := (1:.N) * 1., by = c("Year", "sibling_idH")]
dat[is.na(sibling_idH), sib_id := -Inf]

##Get occupation and age of older brother:
setorder(dat, sibling_idH, Year, -Age)
dat[, ":="(sib_Age = Age[sib_id == 1],
           sib_lfocc_lab = lfocc_lab[sib_id == 1],
           sib_hisclass = hisclass[sib_id == 1],
           sib_Occ = Occ[sib_id == 1],
           sib_Occode = Occode[sib_id == 1],
           sib_Sname = Sname[sib_id == 1]),
    by = c("Year", "sibling_idH")]

##Older sibling is pointing to himself
dat[sib_id == 1, c("sib_Age",
                   "sib_lfocc_lab",
                   "sib_hisclass",
                   "sib_Occ",
                   "sib_Occode",
                   "sib_Sname") := NA]

##########################################################################
### KEEP ONLY THOSE WITHIN LONDON
### Data created: "JoinedDataset"

mapping <- read_feather(file.path(fldr, "links_withHHmen"), columns = c("person_id", "ConParID", "Year"))
setDT(mapping)
dat <- mapping[dat, on = c("person_id", "Year")]
gc()
dat[, ConParID := as.numeric(ConParID)]

dat[!is.na(father_Sname), dist_FSuname := stringdist(Sname, father_Sname, method = "jw", p = .1)]
dat[!is.na(gfather_Sname), dist_GFSuname := stringdist(Sname, gfather_Sname, method = "jw", p = .1)]
dat[!is.na(sib_Sname), dist_SibSuname := stringdist(Sname, sib_Sname, method = "jw", p = .1)]

dat[, Age := as.numeric(Age)]

##Read London Parish:
mapping <- fread(file.path(fldr, "London_Conparid"))

## Get only those living in London and homogeneous conparid, conparid_std:
dat <- mapping[dat, on = c("conparid==ConParID", "Year"), nomatch = 0]
gc()

### Save
write_feather(dat, file.path(fldr, "JoinedDataset"))

##############################################################################################
### MATCH TO ADDRESS
### Data created: 1881_1851_Matched_Addresses

##Get London Maps
uk51 <- rgdal::readOGR(dsn = fldr_napp, layer="London51")

uk01 <- rgdal::readOGR(dsn = fldr_napp,layer="London01")

####Get street names in 1881:
street <- rgdal::readOGR(dsn = fldr_napp, 
                         layer = "LL_PL_PA_POINTS_1881",
                         use_iconv = TRUE,encoding = "utf-8",
                         integer64 = "warn.loss")
street <- street[abs(street@coords[, 1]) < 1e6 & abs(street@coords[, 2]) < 1e6, ]

##Get streets that lie inside and outside the 01 polygon:
proj4string(street) <- proj4string(uk01)
##1901 map
street@data$ConParID01 <- over(street, uk01)
##1851 map
street@data$ConParID51 <- over(street, uk51)

##Those that lie outside:
##1901 map
pout <- street[which(is.na(street@data$ConParID01)), ]
street@data$ConParID01[is.na(street@data$ConParID01)] <- sapply(1:length(pout), function(y) { 
  uk01[which.min((sapply(1:length(uk01), function(x) rgeos::gDistance(pout[y,], uk01[x,])))), ]@data$conparid
})
##1851 map
pout <- street[which(is.na(street@data$ConParID51)), ]
street@data$ConParID51[is.na(street@data$ConParID51)] <- sapply(1:length(pout), function(y) { 
  uk51[which.min((sapply(1:length(uk51), function(x) rgeos::gDistance(pout[y,], uk51[x,])))), ]@data$conparid
})

##Do a mapping from ParID in 1911 into ConParID in 1881  
dat <- read_feather(file.path(fldrtmp, "links_withHHmen"), columns = c("person_id", "ConParID", "Year",
                                                                       "Address", "SubDist", "Parish",
                                                                       "ParID"))
setDT(dat)

dat[, ConParID := as.numeric(ConParID)]
dat <- dat[Year %in% c(1851, 1881) & !is.na(ConParID)]
##dat <- dat[Year %in% c(1911, 1881) & !is.na(ConParID)]

##Remove all characters before numbers
dat[, Address2 := .(paste0(" ", gsub(".*[0-9](.+?)$", "\\1", Address, " ")))]

dat <- unique(dat[, .(Address2 = trimws(toupper(Address2)),
                      Address, ConParID, Year)])
gc()

## Match for 1851
for(y in c(1881, 1851)) { 
  unParID <- unique(unlist(street@data$ConParID51))
  for(i in unParID) {
    ##If there is more than one street with the same name it is impossible to know
    ##which one to match:
    slct <- (unlist(street@data$ConParID51) == i)
    
    names <- data.table(stname = toupper(as.character(street@data$P_NAME[slct])),
                        ConParID51 = street@data$ConParID51[slct,],
                        stid = street@data$OBJECTID_1[slct])
    
    names[, ob := .N, by = stname]
    names <- names[ob == 1]
    
    dat[ConParID == i & Year == y, match := findClosest(Address2, names$stname),
        by = Address]
    
    dat[ConParID == i & Year == y & !is.na(match),
        ":="(stname = names$stname[match],
             stid = names$stid[match],
             ConParID51 = names$ConParID51[match])]
    
    dat[, match := NULL]
  }
}

dat[, stname_distance := stringdist(Address2, stname, method = "jw", p = .1)]

fwrite(dat, "1881_1851_Matched_Addresses")

##############################################################################################
### GET ALL DATA TOGETHER
# JoinedDataset: links_withHHmen for London only
# links_withHHmen: linked data with all relevant variables 
# Homogeneised ID: Homogenised_ID.r
# Address: 1881_1851_Matched_Addresses
# Data created: Dataset_1851_1881

dat <- read_feather(file.path(fldr, "JoinedDataset"))
setDT(dat)

dat <- dat[Year %in% c(1881, 1851)]

## Drop Grand father info:
vars <- names(dat)[grep("father", tolower(names(dat)))]
vars <- vars[grep("[^father_idH]", vars)]
dat[, (vars) := NULL]

## Merge father occupation in current census
fthr <- read_feather(file.path(fldrtmp, "links_withHHmen"), c("person_id", 
                                                              "Year", "Age",
                                                              "hisclass", "Sname",
                                                              "lfocc_lab",
                                                              "Occ", "Occode"))
setDT(fthr)

names(fthr)[names(fthr) == "person_id"] <- "id"
names(fthr) <- paste0("father_", names(fthr))
homo <- fread(file.path(fldrtmp, "Homogenised_ID.r"))[, .(father_idH = person_idH,
                                                          father_id = person_id,
                                                          father_Year = Year)]
fthr <- homo[fthr, on = c("father_id", "father_Year")]
fthr[, father_id := NULL]
fthr[, Year := father_Year]

dat <- fthr[dat, on = c("father_idH", "Year")]

## Occupation previous census:
fthr[, Year := NULL]
names(fthr)[names(fthr) != "father_idH"] <- paste0("prev_", names(fthr)[names(fthr) != "father_idH"])
fthr[, Year := prev_father_Year + 30]

dat <- fthr[dat, on = c("father_idH", "Year")]

write_feather(dat, file.path(fldr, "Dataset_1851_1881_SubSetVars"))

tmp <- read_feather(file.path(fldr, "links_withHHmen"))
setDT(tmp)
vars <- names(dat)[!(names(dat) %in% c("person_id", "Year"))]
vars <- names(tmp)[!(names(tmp) %in% c(vars, "lfocc_lab.1"))]

## Match all original individual variabels
dat <- tmp[, (vars), with = FALSE][dat, on = c("person_id", "Year")]

dat[, Address := trimws(Address)]
dat[, ConParID := as.numeric(ConParID)]
map <- fread(file.path(fldr, "1881_1851_Matched_Addresses"))
dat <- map[dat, on = c("Address", "Year", "ConParID")]

write_feather(dat, file.path(fldr, "Dataset_1851_1881"))
